Interactive Data Visualization

Automotive crashes remain one of the leading causes of death in the United States. After many years of declining traffic fatalities, 2015 saw an uptick in accidents, with many factors contributing to the increase.

Import Packages

In [4]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

#Plotly is an online analytics and data visualization tool
import chart_studio.plotly as py
import plotly.graph_objects as go
from plotly import tools
from plotly.offline import iplot, init_notebook_mode
import plotly
plotly.offline.init_notebook_mode()

Load Traffic Accident Data

The dataset for 2015 US traffic accidents (accident.csv) is read from the ../EDA/Data folder, relative to this notebook.

In [12]:
# positions of the columns to pull out of accident.csv
wanted_column_positions = [0, 1, 8, 11, 12, 13, 25, 26, 28, 38, 50, 51]
# friendlier names for a few of the raw headers (applied before capitalizing)
friendlier_names = {'ST_CASE': 'case_id', 'LONGITUD': 'longitude',
                    'HARM_EV': 'harmful_event', 'DRUNK_DR': 'drunk_drivers'}
# the three columns that are merged into a single 'Date' column
date_parts = ['Day', 'Month', 'Year']
# column order of the final frame
final_column_order = ['Case_id', 'Date', 'State', 'Latitude', 'Longitude', 'Weather',
                      'Harmful_event', 'Persons', 'Fatals', 'Drunk_drivers']

# load only the selected columns, rename, then capitalize every header
# (e.g. 'case_id' -> 'Case_id')
accident_data = pd.read_csv('../EDA/Data/accident.csv', usecols=wanted_column_positions)
accident_data = accident_data.rename(columns=friendlier_names)
accident_data.columns = accident_data.columns.str.capitalize()

# assemble 'Date', discard its parts, reorder the columns and sort chronologically
accident_data = (
    accident_data
    .assign(Date=pd.to_datetime(accident_data[date_parts]))
    .drop(columns=date_parts)
    .loc[:, final_column_order]
    .sort_values('Date')
)

# preview the first 5 rows
accident_data.head()
Out[12]:
Case_id Date State Latitude Longitude Weather Harmful_event Persons Fatals Drunk_drivers
0 10001 2015-01-01 1 33.878653 -87.325328 1 35 1 1 1
10269 170006 2015-01-01 17 38.740544 -89.481522 1 34 1 1 1
10315 170058 2015-01-01 17 42.401167 -88.184139 1 33 1 1 1
10530 170288 2015-01-01 17 41.902419 -87.745142 1 9 1 1 0
10551 170309 2015-01-01 17 41.827067 -88.201881 1 8 1 1 0

Number of Persons Involved in Traffic Accidents by Location

In [10]:
# add a new column 'Text' (date + number of persons involved), shown on hover
accident_data['Text'] = (accident_data['Date'].dt.strftime('%Y-%m-%d') + ', '
                         + accident_data['Persons'].astype(str) + ' involved')

# keep only rows with a negative longitude; filter once and reuse the result
# NOTE(review): presumably this drops placeholder coordinates used for unknown
# locations -- confirm against the dataset's data dictionary
accident_located = accident_data[accident_data['Longitude'] < 0]

# Marker area is proportional to the number of persons involved, which is what
# the title and the hover text describe. (Sizing by 'Drunk_drivers' gave every
# accident without a drunk driver a size of 0, i.e. made it invisible.)
max_marker_px = 30
persons_max = accident_located['Persons'].max()
# plotly's recommended area scaling: sizeref = 2 * max(size) / max_px ** 2;
# fall back to 1 when there is nothing to plot (max is NaN) to avoid a bad sizeref
marker_sizeref = 2.0 * persons_max / max_marker_px ** 2 if persons_max > 0 else 1

# data to be displayed on the figure
data = [dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = accident_located['Longitude'],
        lat = accident_located['Latitude'],
        text = accident_located['Text'],
        mode = 'markers',
        # define the marker size, color, and style
        marker = dict(
            size = accident_located['Persons'],
            sizemode = 'area',
            sizeref = marker_sizeref,
            # keep single-person accidents visible
            sizemin = 2,
            opacity = 0.8,
            color = 'rgb(255, 0, 250)')
        )]

# the layout of the figure
layout = dict(
         title = 'Number of Persons Involved in Traffic Accidents in USA in 2015<br>'
                 '<sub>Hover to view the details</sub>',
         geo = dict(
             scope = 'usa',
             projection = dict(type = 'albers usa'),
             showland = True,
             landcolor = 'rgb(250, 250, 250)',
             subunitwidth = 1,
             subunitcolor = 'rgb(217, 217, 217)',
             countrywidth = 1,
             countrycolor = 'rgb(217, 217, 217)',
             showlakes = True,
             lakecolor = 'rgb(255, 255, 255)')
         )

# show the figure
figure = dict(data = data, layout = layout)
iplot(figure)

Number of Fatalities in Traffic Accidents by State

In [11]:
# Numeric state code (as stored in the 'State' column) -> two-letter abbreviation.
# An explicit mapping keeps every total attached to its own state; pairing the
# groupby output with a list of abbreviations purely by position would silently
# shift values onto the wrong states if any code were missing from the data.
state_code_to_abbr = {
     1: 'AL',  2: 'AK',  4: 'AZ',  5: 'AR',  6: 'CA',  8: 'CO',  9: 'CT', 10: 'DE',
    11: 'DC', 12: 'FL', 13: 'GA', 15: 'HI', 16: 'ID', 17: 'IL', 18: 'IN', 19: 'IA',
    20: 'KS', 21: 'KY', 22: 'LA', 23: 'ME', 24: 'MD', 25: 'MA', 26: 'MI', 27: 'MN',
    28: 'MS', 29: 'MO', 30: 'MT', 31: 'NE', 32: 'NV', 33: 'NH', 34: 'NJ', 35: 'NM',
    36: 'NY', 37: 'NC', 38: 'ND', 39: 'OH', 40: 'OK', 41: 'OR', 42: 'PA', 44: 'RI',
    45: 'SC', 46: 'SD', 47: 'TN', 48: 'TX', 49: 'UT', 50: 'VT', 51: 'VA', 53: 'WA',
    54: 'WV', 55: 'WI', 56: 'WY'}

# US states, in the same order as the codes above
us_states = np.asarray(list(state_code_to_abbr.values()))

# total fatals per state, aligned to us_states by state code;
# states with no rows in the data get 0, codes outside the mapping are ignored
fatals_perstate = (accident_data.groupby('State')['Fatals'].sum()
                   .reindex(list(state_code_to_abbr))
                   .fillna(0)
                   .astype(int)
                   .values)

# color scale: green (fewest fatals) to red (most fatals)
color_scale = [[0, 'rgb(0, 255, 0)'], [1, 'rgb(255, 0, 0)']]

# a choropleth map colors each state by its value, similar to a heat map
data = [dict(
        type = 'choropleth',
        autocolorscale = False,
        colorscale = color_scale,
        showscale = False,
        locations = us_states,
        locationmode = 'USA-states',
        z = fatals_perstate,
        marker = dict(
            line = dict(
                color = 'rgb(200, 200, 200)',
                width = 3)
            )
        )]

layout = dict(
         title = 'Number of Fatals Involved in Traffic Accidents by State in USA in 2015<br>'
                 '<sub>Hover to view the details</sub>',
         geo = dict(
             scope = 'usa',
             projection = dict(type = 'albers usa'),
             countrycolor = 'rgb(255, 255, 255)',
             showlakes = True,
             lakecolor = 'rgb(255, 255, 255)')
         )

figure = dict(data = data, layout = layout)
iplot(figure)

Number of Traffic Accidents by Date

In [13]:
# number of traffic accidents recorded on each date, indexed by date
accidents_by_date = accident_data.groupby('Date')['Case_id'].count()

# Put the counts on a gap-free daily calendar spanning the data, so a day
# without any recorded accident shows up as 0 instead of shifting every later
# value onto the wrong date.
accident_dates = pd.date_range(accidents_by_date.index.min(),
                               accidents_by_date.index.max(), freq='D')
accidents_by_date = accidents_by_date.reindex(accident_dates, fill_value=0)
accident_perdate = accidents_by_date.values

# twenty-day moving average of accidents per date, centered on its window:
# the value plotted at day i averages days i-10 .. i+9
rolling_window_days = 20
accidents_smoothed = (accidents_by_date
                      .rolling(window=rolling_window_days, center=True)
                      .mean()
                      .dropna())  # the incomplete windows at both ends are NaN

# rounded averages and the dates they belong to
accident_average = accidents_smoothed.values.round()
accident_range = accidents_smoothed.index

# go == plotly.graph_objects
fig = go.Figure()

# scatter graph object #1: raw daily counts
fig.add_trace(
  go.Scatter(
             x = accident_dates,
             y = accident_perdate,
             mode = 'lines',
             # the series counts accidents (rows), not the 'Fatals' column
             name = 'Accidents',
             line = dict(
                 color = 'rgb(215, 0, 0)',
                 width = 3)
 ))

# scatter graph object #2: smoothed trend, drawn semi-transparent on top
fig.add_trace(
  go.Scatter(
             x = accident_range,
             y = accident_average,
             mode = 'lines',
             name = '{}-day average'.format(rolling_window_days),
             line = dict(
                 color = 'rgb(0, 0, 255)',
                 width = 5),
             opacity = 0.33
  ))

# title and axis labels so the figure can be read on its own
fig.update_layout(
    title = 'Number of Traffic Accidents by Date in USA in 2015<br>'
            '<sub>Hover to view the details</sub>',
    xaxis_title = 'Date',
    yaxis_title = 'Accidents per day')

fig.show()

Number of Traffic Accidents by Weather Condition

In [14]:
# weather mapping (numeric code -> label)
# Code 3 used to repeat the label of code 12, so two different conditions were
# indistinguishable on the chart.
# NOTE(review): 'Sleet or Hail' follows the FARS coding manual -- confirm for this data year
map_weather = {0: 'No additional atmospheric condition', 1:'Clear', 2:'Rain', 3:'Sleet or Hail',
               4:'Snow', 5:'Fog, smog, smoke', 6:'Severe crosswinds', 7:'Blowing sand, soil, dirt',
               8:'Other', 10:'Cloudy', 11:'Blowing snow', 12:'Freezing Rain or Drizzle', 98:'Not reported', 99:'Unknown'}

# accidents count grouped by weather condition, most frequent first;
# codes >= 15 (98 'Not reported', 99 'Unknown') are left out
accident_weather_counts = (accident_data[accident_data['Weather'] < 15]
                           .groupby('Weather')['Case_id']
                           .count()
                           .sort_values(ascending=False))
# x-axis displays the accident counts by weather condition
x_values = accident_weather_counts.values
# the percentage of accidents for each weather condition (among the plotted codes)
weather_percent = np.round(x_values / x_values.sum() * 100, 2).astype(str)
weather_percent = np.array([s + '%' for s in weather_percent])

# y-axis displays the weather conditions in string
y_values = pd.Series(accident_weather_counts.index).map(map_weather).values

# Bar graphic-object
data = [go.Bar(
        x = x_values,
        y = y_values,
        text = weather_percent,
        orientation = 'h',
        hoverinfo = 'y+text',
        marker = dict(
            color = 'rgb(200, 0, 200)')
        )]

# the layout; each bar is annotated with its raw count just past its end
layout = go.Layout(
         title = 'Number of Traffic Accidents by Weather Condition in USA in 2015<br>'
                    '<sub>Hover to view the details</sub>',
         xaxis = dict(
             showgrid = False,
             showticklabels = False
         ),
         autosize = False,
         margin = dict(
             autoexpand = False,
             l = 200, r = 40, pad = 5
         ),
         annotations = [
             dict(x = x, y = y,
                  text = str(x),
                  xanchor = 'left',
                  yanchor = 'middle',
                  showarrow = False) for x, y in zip(x_values, y_values)]
         )

figure = dict(data = data, layout = layout)
iplot(figure)

Number of Injured Persons by Top-10 Harmful Events

In [17]:
# how many of the most frequent harmful events are plotted
top_event_count = 10

# accident counts grouped by harmful events, most frequent first
accident_harmful_events = (accident_data.groupby('Harmful_event')['Case_id']
                           .count()
                           .sort_values(ascending=False))
# total accident counts
total_accidents = accident_harmful_events.sum()

# accident counts by top-10 harmful events (positional slice of the sorted counts)
accident_harmful_events_top10 = accident_harmful_events.iloc[:top_event_count]
# accident percentage by top-10 harmful events
accident_harmful_events_top10_percentage = np.round(accident_harmful_events_top10 / total_accidents * 100, 2)

# mapping between harmful event code and text
map_harmful_events = {12:'Motor Vehicle in Transport', 8:'Pedestrian', 1:'Rollover/Overturn', 42:'Tree', 33:'Curb',
                      34:'Ditch', 35:'Embankment', 9:'Pedalcyclist', 24:'Guardrail Face', 30:'Utility Pole/Light Support'}

# persons involved and fatals per harmful event, computed in a single groupby
totals_by_event = accident_data.groupby('Harmful_event')[['Persons', 'Fatals']].sum()
# number of persons involved grouped by harmful events (top-10 order)
persons_harmful_events_top10 = totals_by_event.loc[accident_harmful_events_top10.index, 'Persons']
# number of fatals grouped by harmful events (top-10 order)
fatals_harmful_events_top10 = totals_by_event.loc[accident_harmful_events_top10.index, 'Fatals']

# the text for the top-10 harmful events; a code missing from the mapping is
# shown as 'Code <n>' instead of breaking the label construction below
harmful_event_categories = np.asarray(
    [map_harmful_events.get(code, 'Code {}'.format(code))
     for code in accident_harmful_events_top10.index])

# area of the circle: accident count (and percentage) by harmful events
harmful_event_count = accident_harmful_events_top10.values
harmful_event_percent = accident_harmful_events_top10_percentage.values

# y-axis: the number of fatals by harmful events
harmful_event_fatals = fatals_harmful_events_top10.values
# log10 position on the y-axis (annotations on a log axis are placed in log units)
harmful_event_yaxis = np.log10(harmful_event_fatals)

# x-axis: the number of persons involved by harmful events
harmful_event_persons = persons_harmful_events_top10.values
# log10 position on the x-axis
harmful_event_xaxis = np.log10(harmful_event_persons)

# the display text when hovered
harmful_event_text = [
    '{} ({}%)<br>{} Killed, {} Involved'.format(category, percent, fatals, persons)
    for category, percent, fatals, persons in zip(
        harmful_event_categories, harmful_event_percent,
        harmful_event_fatals, harmful_event_persons)]

data = [go.Scatter(
        x = harmful_event_persons,
        y = harmful_event_fatals,
        text = harmful_event_text,
        mode = 'markers',
        hoverinfo = 'text',
        marker = dict(
            size = (harmful_event_count) / 100,
            opacity = 0.9,
            color = 'rgb(240, 140, 45)')
        )]

# annotation is the category name placed below the center of each circle.
# 'center' is the valid horizontal anchor ('middle' is only valid for yanchor),
# and the positions are the log10 values because both axes are logarithmic.
annotations = [
    dict(x = log_x, y = log_y,
         xanchor = 'center', yanchor = 'top',
         text = category,
         showarrow = False)
    for log_x, log_y, category in zip(
        harmful_event_xaxis, harmful_event_yaxis, harmful_event_categories)]

# layout: log scale for both x-axis and y-axis
layout = go.Layout(
         title = 'Number of Injured Persons by Top-10 Harmful Events in USA in 2015<br>'
                    '<sub>Hover to view the details</sub>',
         xaxis = dict(
             title = 'Persons Involved',
             type = 'log',
             tickmode = 'auto',
             nticks = 4,
             showline = True,
             showgrid = False
         ),
         yaxis = dict(
             title = 'Fatals',
             type = 'log',
             tickmode = 'auto',
             nticks = 3,
             showline = True,
             showgrid = False),
         annotations = annotations
         )

figure = dict(data = data, layout = layout)
iplot(figure)

Accidents by Drunk or Sober Drivers per Date

In [16]:
# accidents per date, indexed by date
accidents_by_date = accident_data.groupby('Date')['Case_id'].count()
accident_total_perdate = accidents_by_date.values

# accidents involving at least one drunk driver
accident_drunk_drivers = accident_data[accident_data['Drunk_drivers'] > 0]
# ... per date, aligned to the dates of the totals: a date without any such
# accident counts as 0 rather than shortening the array and misaligning the division
accident_drunk_drivers_perdate = (accident_drunk_drivers.groupby('Date')['Case_id']
                                  .count()
                                  .reindex(accidents_by_date.index, fill_value=0)
                                  .values)
# percentage of the accidents involving drunk drivers per date
accident_drunk_drivers_perdate_percentage = np.round(
    np.divide(accident_drunk_drivers_perdate, accident_total_perdate) * 100, 1)

# accidents without any drunk driver
accident_sober_drivers = accident_data[accident_data['Drunk_drivers'] == 0]
# ... per date, aligned to the dates of the totals in the same way
accident_sober_drivers_perdate = (accident_sober_drivers.groupby('Date')['Case_id']
                                  .count()
                                  .reindex(accidents_by_date.index, fill_value=0)
                                  .values)
# percentage of the accidents without drunk drivers per date
accident_sober_drivers_perdate_percentage = np.round(
    np.divide(accident_sober_drivers_perdate, accident_total_perdate) * 100, 1)

# the dates that actually occur in the data, so x and y always have equal length
accident_dates = accidents_by_date.index.values

# labels
labels = ['Drunk drivers', 'Sober drivers']
# colors
colors = ['rgb(0, 0, 200)', 'rgb(200, 0, 0)']
# x-axis
x_data = accident_dates
# y-axis: one row per label
y_data = np.asarray([accident_drunk_drivers_perdate_percentage, accident_sober_drivers_perdate_percentage])

# one line per driver category
traces = [
    go.Scatter(
        x = x_data,
        y = y_values,
        mode = 'lines',
        name = label,
        line = dict(color = color, width = 3)
    )
    for y_values, label, color in zip(y_data, labels, colors)]

# the legend identifies the two lines
layout = go.Layout(
         title = 'Accidents by Drunk or Sober Drivers per Date in USA in 2015<br>'
                    '<sub>Hover to view the details</sub>',
         showlegend = True,
         xaxis = dict(
             showline = True,
             showgrid = True
         ),
         yaxis = dict(
             ticksuffix = '%',
             showline = True,
             zeroline = False,
             showgrid = True,
             showticklabels = True,
             range = [0., 100]
         ),
         margin = dict(
             autoexpand = True,
             l = 127, r = 38)
         )

figure = dict(data = traces, layout = layout)
iplot(figure)
In [ ]: